data <- read_csv("data.csv")
head(data)
## # A tibble: 6 Ă— 11
##    Year State_Abbr State      Binge_Drinking_Prevalence Legalized Bachelors_Rate
##   <dbl> <chr>      <chr>                          <dbl>     <dbl>          <dbl>
## 1  2011 AL         Alabama                         13.7         0           22.3
## 2  2011 AK         Alaska                          20.8         1           26.4
## 3  2011 AZ         Arizona                         17.6         0           26.6
## 4  2011 AR         Arkansas                        14.1         0           20.3
## 5  2011 CA         California                      18.6         1           30.3
## 6  2011 CO         Colorado                        20.1         1           36.7
## # ℹ 5 more variables: Median_Age <dbl>, Urbanization_Rate <dbl>,
## #   Legalization_Year <dbl>, G <dbl>, State_ID <dbl>
summary(data)
##       Year       State_Abbr           State           Binge_Drinking_Prevalence
##  Min.   :2011   Length:659         Length:659         Min.   : 9.60            
##  1st Qu.:2014   Class :character   Class :character   1st Qu.:14.79            
##  Median :2017   Mode  :character   Mode  :character   Median :16.50            
##  Mean   :2017                                         Mean   :16.71            
##  3rd Qu.:2020                                         3rd Qu.:18.30            
##  Max.   :2023                                         Max.   :27.20            
##                                                                                
##    Legalized      Bachelors_Rate   Median_Age    Urbanization_Rate
##  Min.   :0.0000   Min.   :18.5   Min.   :29.60   Min.   :0.3117   
##  1st Qu.:0.0000   1st Qu.:27.2   1st Qu.:37.05   1st Qu.:0.6161   
##  Median :0.0000   Median :30.9   Median :38.40   Median :0.7209   
##  Mean   :0.1973   Mean   :31.8   Mean   :38.42   Mean   :0.7224   
##  3rd Qu.:0.0000   3rd Qu.:35.4   3rd Qu.:39.65   3rd Qu.:0.8551   
##  Max.   :1.0000   Max.   :65.9   Max.   :45.10   Max.   :1.0000   
##                                                                   
##  Legalization_Year       G             State_ID    
##  Min.   :2012      Min.   :   0.0   Min.   : 1.00  
##  1st Qu.:2014      1st Qu.:   0.0   1st Qu.:13.00  
##  Median :2015      Median :   0.0   Median :26.00  
##  Mean   :2015      Mean   : 397.4   Mean   :26.01  
##  3rd Qu.:2016      3rd Qu.:   0.0   3rd Qu.:39.00  
##  Max.   :2016      Max.   :2016.0   Max.   :51.00  
##  NA's   :529
data <- data %>%
  mutate(
    Post = ifelse(!is.na(Legalization_Year) & Year >= Legalization_Year, 1, 0)
  )

did_model <- feols(
  Binge_Drinking_Prevalence ~ i(Post, Legalized, ref = 0) + Bachelors_Rate + Median_Age + Urbanization_Rate | State + Year,
  data = data
)

summary(did_model)
## OLS estimation, Dep. Var.: Binge_Drinking_Prevalence
## Observations: 659
## Fixed-effects: State: 51,  Year: 13
## Standard-errors: Clustered (State) 
##                   Estimate Std. Error  t value Pr(>|t|) 
## Post::1:Legalized 0.135629   0.345874 0.392136  0.69662 
## Bachelors_Rate    0.032055   0.109348 0.293149  0.77062 
## Median_Age        0.059238   0.243106 0.243672  0.80848 
## Urbanization_Rate 6.496083  22.503565 0.288669  0.77403 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.04364     Adj. R2: 0.86782 
##                 Within R2: 0.002049
modelsummary(did_model,
             stars = TRUE,
             fmt = 3,
             estimate = "{estimate} ({std.error})",
             statistic = NULL,
             title = "Difference-in-Differences Regression Results",
             output = "markdown")
Difference-in-Differences Regression Results
(1)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Post = 1 Ă— Legalized 0.136 (0.346)
Bachelors_Rate 0.032 (0.109)
Median_Age 0.059 (0.243)
Urbanization_Rate 6.496 (22.504)
Num.Obs. 659
R2 0.881
R2 Adj. 0.868
R2 Within 0.002
R2 Within Adj. -0.005
AIC 2060.5
BIC 2361.3
RMSE 1.04
Std.Errors by: State
FE: State X
FE: Year X
ggplot(data, aes(x = Year, y = Binge_Drinking_Prevalence, group = State, color = as.factor(Legalized))) +
  geom_line(alpha = 0.5, size = 0.4) +
  scale_color_manual(
    values = c("0" = "darkgray", "1" = "darkgreen"),
    labels = c("0" = "Not Legalized", "1" = "Legalized"),
    name = "Legal Status"
  ) +
  labs(
    title = "Binge Drinking Prevalence Over Time by State",
    subtitle = "Lines represent individual states; color indicates legalization status",
    x = "Year",
    y = "Binge Drinking Prevalence (%)"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Create average trends by treatment status
avg_trends <- data %>%
  group_by(Year, Legalized) %>%
  summarize(
    avg_binge = mean(Binge_Drinking_Prevalence, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    Legalized = ifelse(Legalized == 1, "Treated (Legalized)", "Control (Not Legalized)")
  )

# Plot
ggplot(avg_trends, aes(x = Year, y = avg_binge, color = Legalized)) +
  geom_line(size = 1.1) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("Treated (Legalized)" = "darkgreen", "Control (Not Legalized)" = "gray60")
  ) +
  labs(
    title = "Average Binge Drinking Prevalence Over Time",
    subtitle = "Comparing Treated vs. Control States",
    x = "Year",
    y = "Average Binge Drinking Prevalence (%)",
    color = "Group"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

# Extract model estimates
did_df <- broom::tidy(did_model) %>%
  filter(str_detect(term, "Post::1:Legalized")) %>%
  mutate(
    lower = estimate - 1.96 * std.error,
    upper = estimate + 1.96 * std.error
  )

# Plot
ggplot(did_df, aes(x = term, y = estimate)) +
  geom_point(size = 3, color = "darkred") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.15, color = "darkred") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray60") +
  labs(
    title = "Estimated DiD Treatment Effect",
    subtitle = "Effect of Marijuana Legalization on Binge Drinking Prevalence",
    x = "",
    y = "Estimated Effect (percentage points)"
  ) +
  theme_minimal(base_size = 13)

# Treated states with cohort year
treated_data <- data %>%
  filter(!is.na(Legalization_Year)) %>%
  mutate(Cohort = as.factor(Legalization_Year))

# Control group
control_data <- data %>%
  filter(is.na(Legalization_Year)) %>%
  mutate(Cohort = "Control")

# Combine both groups
combined_data <- bind_rows(treated_data, control_data)

# Average binge drinking by group and year
avg_trends <- combined_data %>%
  group_by(Year, Cohort) %>%
  summarise(Avg_Binge = mean(Binge_Drinking_Prevalence, na.rm = TRUE), .groups = "drop")

# Get legalization years per cohort (excluding control)
cohort_lines <- treated_data %>%
  distinct(Cohort, Legalization_Year)

# Plot
ggplot(avg_trends, aes(x = Year, y = Avg_Binge, color = Cohort)) +
  geom_line(size = 1.1) +
  geom_point(size = 2) +
  # Add vertical line for each treated cohort
  geom_vline(data = cohort_lines, aes(xintercept = Legalization_Year, color = Cohort),
             linetype = "dashed", show.legend = FALSE) +
  labs(
    title = "Binge Drinking Trends by Legalization Cohort and Control Group",
    subtitle = "Dashed lines mark legalization years for each cohort",
    x = "Year",
    y = "Avg. Binge Drinking Prevalence (%)",
    color = "Group"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

# Group treated states by their cohort
cohort_states <- data %>%
  filter(!is.na(Legalization_Year)) %>%
  distinct(State, State_Abbr, Legalization_Year) %>%
  mutate(Cohort = as.factor(Legalization_Year)) %>%
  arrange(Cohort)

control_states <- data %>%
  filter(is.na(Legalization_Year)) %>%
  distinct(State, State_Abbr) %>%
  mutate(Cohort = "Control") %>%
  arrange(State)

# Combine treated and control states
state_cohorts <- bind_rows(cohort_states, control_states)

# View the result
print(state_cohorts)
## # A tibble: 51 Ă— 4
##    State                State_Abbr Legalization_Year Cohort
##    <chr>                <chr>                  <dbl> <chr> 
##  1 Colorado             CO                      2012 2012  
##  2 Washington           WA                      2012 2012  
##  3 Alaska               AK                      2014 2014  
##  4 District of Columbia DC                      2014 2014  
##  5 Oregon               OR                      2014 2014  
##  6 California           CA                      2016 2016  
##  7 Maine                ME                      2016 2016  
##  8 Massachusetts        MA                      2016 2016  
##  9 Nevada               NV                      2016 2016  
## 10 Vermont              VT                      2016 2016  
## # ℹ 41 more rows
#More detail on plot 5

# Label treated states with cohort (legalization year)
treated_data <- data %>%
  filter(!is.na(Legalization_Year)) %>%
  mutate(Cohort = as.factor(Legalization_Year))

# Create average line for control group
control_avg <- data %>%
  filter(is.na(Legalization_Year)) %>%
  group_by(Year) %>%
  summarise(
    Avg_Binge = mean(Binge_Drinking_Prevalence, na.rm = TRUE),
    .groups = "drop"
  )

# Plot
ggplot() +
  # Plot individual treated states, color-coded by cohort
  geom_line(data = treated_data,
            aes(x = Year, y = Binge_Drinking_Prevalence, group = State, color = Cohort),
            alpha = 0.7, size = 0.5) +

  # Add control group average line
  geom_line(data = control_avg,
            aes(x = Year, y = Avg_Binge),
            color = "grey60", linetype = "solid", size = 1) +

  labs(
    title = "Treated States Colored by Cohort; Control Group Averaged",
    subtitle = "Each treated state shown individually; control group shown as single average line",
    x = "Year",
    y = "Binge Drinking Prevalence (%)",
    color = "Legalization Cohort"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom")

OLD CODE

# Extract regression summary as a tidy data frame
library(broom)

# Convert model summary to a data frame
df <- tidy(did_model)

# Add confidence intervals
df <- df %>%
  mutate(
    lower = estimate - 1.96 * std.error,
    upper = estimate + 1.96 * std.error
  )

# Plot
ggplot(df, aes(x = term, y = estimate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
  coord_flip() +
  labs(
    title = "Coefficient Plot",
    x = "Covariates",
    y = "Estimate (with 95% CI)"
  ) +
  theme_minimal(base_size = 14)

library(ggplot2)

ggplot(data, aes(x = Year, y = Binge_Drinking_Prevalence, group = State, color = as.factor(Legalized))) +
  geom_line(alpha = 0.6) +
  labs(
    title = "Binge Drinking Prevalence Over Time by State",
    x = "Year",
    y = "Binge Drinking Prevalence (%)",
    color = "Legalized"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

data_aligned <- data %>%
  filter(!is.na(Legalization_Year)) %>%
  mutate(relative_year = Year - Legalization_Year)

ggplot(data_aligned, aes(x = relative_year, y = Binge_Drinking_Prevalence, group = State)) +
  geom_line(alpha = 0.3, color = "gray") +
  stat_summary(fun = mean, geom = "line", color = "orange", linewidth = .8) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Binge Drinking Trends Aligned to Legalization Year",
    x = "Years Relative to Legalization",
    y = "Binge Drinking Prevalence (%)"
  ) +
  theme_minimal()

data_normalized <- data %>%
  filter(!is.na(Legalization_Year)) %>%
  mutate(relative_year = Year - Legalization_Year) %>%
  group_by(State) %>%
  mutate(
    baseline = Binge_Drinking_Prevalence[relative_year == 0],
    normalized_prevalence = Binge_Drinking_Prevalence - baseline
  ) %>%
  ungroup()

ggplot(data_normalized, aes(x = relative_year, y = normalized_prevalence, group = State)) +
  geom_line(alpha = 0.3, color = "gray") +
  stat_summary(fun = mean, geom = "line", color = "orange", size = .5) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Normalized Binge Drinking Trends (Aligned to Legalization Year)",
    x = "Years Relative to Legalization",
    y = "Change in Binge Drinking Prevalence (%)"
  ) +
  theme_minimal()

data_control <- data %>%
  filter(Legalized == 0) %>%
  mutate(relative_year = Year - 2016) %>%
  group_by(State) %>%
  mutate(
    baseline = Binge_Drinking_Prevalence[Year == 2016],
    normalized_prevalence = Binge_Drinking_Prevalence - baseline
  ) %>%
  ungroup()

ggplot(data_control, aes(x = relative_year, y = normalized_prevalence, group = State)) +
  geom_line(alpha = 0.3, color = "gray") +
  stat_summary(fun = mean, geom = "line", color = "orange", size = .3) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Normalized Binge Drinking Trends (Control States Aligned at 2016)",
    x = "Years Relative to 2016 (Placebo Treatment Year)",
    y = "Change in Binge Drinking Prevalence (%)"
  ) +
  theme_minimal()

# Add group label to each dataset
data_treated <- data %>%
  filter(!is.na(Legalization_Year)) %>%
  mutate(
    relative_year = Year - Legalization_Year,
    Group = "Treated"
  ) %>%
  group_by(State) %>%
  mutate(
    baseline = Binge_Drinking_Prevalence[relative_year == 0],
    normalized_prevalence = Binge_Drinking_Prevalence - baseline
  ) %>%
  ungroup()

data_control <- data %>%
  filter(Legalized == 0) %>%
  mutate(
    relative_year = Year - 2016,
    Group = "Control"
  ) %>%
  group_by(State) %>%
  mutate(
    baseline = Binge_Drinking_Prevalence[Year == 2016],
    normalized_prevalence = Binge_Drinking_Prevalence - baseline
  ) %>%
  ungroup()

# Combine both datasets
data_combined <- bind_rows(data_treated, data_control)

# Plot
ggplot(data_combined, aes(x = relative_year, y = normalized_prevalence, group = State)) +
  geom_line(alpha = 0.15, color = "gray") +
  stat_summary(aes(color = Group), fun = mean, geom = "line", size = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Normalized Binge Drinking Trends: Treated vs. Control (Aligned by Year)",
    x = "Years Relative to Treatment or 2016",
    y = "Change in Binge Drinking Prevalence (%)",
    color = "Group"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

states_il <- c("IL", "NE", "KS", "IA", "AR", "OK")

did_data_il <- data %>%
  filter(State_Abbr %in% states_il)

# Illinois legalized in 2020
did_data_il <- did_data_il %>%
  mutate(
    Treated = ifelse(State_Abbr == "IL", 1, 0),
    Post = ifelse(Year >= 2020, 1, 0),
    DiD = Treated * Post
  )

did_model_il <- lm(
  Binge_Drinking_Prevalence ~ DiD + Bachelors_Rate + Median_Age + Urbanization_Rate,
  data = did_data_il
)

summary(did_model_il)
## 
## Call:
## lm(formula = Binge_Drinking_Prevalence ~ DiD + Bachelors_Rate + 
##     Median_Age + Urbanization_Rate, data = did_data_il)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7383 -1.8711 -0.7476  2.4484  6.3249 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -44.5560    15.8670  -2.808  0.00639 ** 
## DiD                -8.2494     1.7921  -4.603 1.72e-05 ***
## Bachelors_Rate      0.2303     0.1186   1.942  0.05595 .  
## Median_Age          1.2310     0.4023   3.060  0.00309 ** 
## Urbanization_Rate  13.8685     5.7830   2.398  0.01904 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.8 on 73 degrees of freedom
## Multiple R-squared:  0.3843, Adjusted R-squared:  0.3506 
## F-statistic: 11.39 on 4 and 73 DF,  p-value: 3.084e-07
library(broom)

tidy(did_model_il) %>%
  filter(term == "DiD") %>%
  ggplot(aes(x = term, y = estimate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = estimate - 1.96 * std.error,
                    ymax = estimate + 1.96 * std.error), width = 0.2) +
  labs(
    title = "DiD Estimate: Illinois vs. Border States",
    x = "",
    y = "Estimated Effect on Binge Drinking (%)"
  ) +
  theme_minimal()

# Filter and label data
did_data_il <- data %>%
  filter(State_Abbr %in% states_il) %>%
  mutate(Group = ifelse(State_Abbr == "IL", "Illinois (Treated)", "Control States"))

# Summarize by year and group
avg_trend <- did_data_il %>%
  group_by(Year, Group) %>%
  summarise(mean_prevalence = mean(Binge_Drinking_Prevalence, na.rm = TRUE), .groups = "drop")

# Plot
ggplot(avg_trend, aes(x = Year, y = mean_prevalence, color = Group)) +
  geom_line(size = 1.1) +
  geom_vline(xintercept = 2020, linetype = "dashed", color = "red") +
  labs(
    title = "Binge Drinking Prevalence Over Time",
    subtitle = "Illinois (Treated) vs. Border States (Controls)",
    x = "Year",
    y = "Avg. Binge Drinking Prevalence (%)",
    color = "Group"
  ) +
  theme_minimal(base_size = 14)

```